Preparations

Load libraries and functions

# cluster: agglomerative hierarchical clustering (agnes); dendextend:
# dendrogram manipulation/comparison (masks stats::cutree, see output below).
library("cluster")
library("dendextend")
## 
## ---------------------
## Welcome to dendextend version 1.12.0
## Type citation('dendextend') for how to cite the package.
## 
## Type browseVignettes(package = 'dendextend') for the package vignette.
## The github page is: https://github.com/talgalili/dendextend/
## 
## Suggestions and bug-reports can be submitted at: https://github.com/talgalili/dendextend/issues
## Or contact: <tal.galili@gmail.com>
## 
##  To suppress this message use:  suppressPackageStartupMessages(library(dendextend))
## ---------------------
## 
## Attaching package: 'dendextend'
## The following object is masked from 'package:stats':
## 
##     cutree
# Project helpers used throughout: selection(), relativeFreqs(),
# normalisations(), cahPlotCol(), somCluster(), countAffixes(), compareHC().
# Loading it attaches ggplot2 (see output below).
source("functions.R")
## Loading required package: ggplot2

Corpus description and selection

Load data

# Provenance: the CSV below was generated once with Stylo, as follows.
# data = stylo::load.corpus.and.parse(corpus.dir = "~/dev/dh-meier/output/kraken-nospace/tokenized/boudams", features = "w", ngram.size = 1, preserve.case = FALSE)
# Get freq lists
#data = stylo::make.table.of.frequencies(corpus = data, features = unique(sort(unlist(data))), relative = FALSE)
# Write it
#write.csv(as.matrix(data), "data/kraken_nospace_expanded_words.csv")
# Read the precomputed text-by-word frequency table and transpose it so that
# words are rows and texts are columns.
data = t(read.csv("data/kraken_nospace_expanded_words.csv", header = TRUE, row.names = 1))

Text lengths

# Estimated length of each text = total word-token count (texts are columns).
nwords = colSums(data)
summary(nwords)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     296    2274    3598    5051    6824   19139
# Distribution of text lengths; the second call re-draws the boxplot as a
# side effect of extracting its outliers ($out).
boxplot(nwords)
boxplot(nwords)$out

## 05_Ano_Leg-A_Ap_NA_Vie_Jacques  29_Wau_Leg-C_Co_Ev_Vie_Martin 
##                          14476                          14639 
## 31_Wau_Leg-C_Co_Ev_Dia_Martin3 34_Wau_Leg-C_Co_Ev_Vie_Martial 
##                          19139                          15407
# Shortest texts, to inform the minimum-length cutoff chosen below.
head(sort(nwords), n = 15)
##          62_Ano_Leg-N_NA_NA_NA_Index          03_Ano_Leg-A_Ap_NA_Mar_Jean 
##                                  296                                  301 
##       61_Ano_Leg-B_NA_NA_NA_Jugement       30_Wau_Leg-C_Co_Ev_Tra_Martin2 
##                                  412                                  726 
##      08_Ano_Leg-A_Ap_NA_Vie_Philippe     59_Ano_Leg-C_Vi_NA_Vie_Euphrasie 
##                                 1017                                 1307 
## 09_Ano_Leg-A_Ap_NA_Vie_JacquesMineur         32_Wau_Leg-C_Co_Ev_Vie_Brice 
##                                 1375                                 1394 
##    60_Ano_Leg-B_NA_NA_NA_Antechriste       54_Ano_Leg-C_Vi_NA_Vie_Pelagie 
##                                 1493                                 1524 
##      20_Ano_Leg-B_Ma_Ho_Vie_Felicite          11_Ano_Leg-A_Ap_NA_Vie_Marc 
##                                 1695                                 1856 
##         23_Ano_Leg-B_Ma_Ho_Vie_Sixte    53_Ano_Leg-C_Vi_NA_Vie_Marguerite 
##                                 1953                                 1961 
##       35_Wau_Leg-C_Co_Ev_Vie_Nicolas 
##                                 1974
# Corpus selection: keep texts longer than 1000 words…
toKeep = colnames(data)[nwords > 1000]

# …and exclude the Bestiaire texts (not part of the legendary corpus proper).
toKeep = toKeep[grep("Bestiaire", toKeep, invert = TRUE)]

df = as.data.frame(nwords)

# Violin + boxplot of text lengths for the paper/report figure.
ggplot(df, aes(x="", y=nwords)) + geom_violin() + geom_boxplot(width=0.3) +  theme(axis.text.y = element_text(size = rel(1.4)), axis.title = element_text(size = rel(1.4))) + xlab("Est. length in words of corpus texts") + scale_y_continuous(breaks=c(0, 2500, 5000, 7500, 10000, 12500, 15000, 17500))

Transkribus raw data

3-grams from raw data

Load data

# Get data with Stylo (provenance of the CSV read below)
#data = stylo::load.corpus.and.parse(corpus.dir = "~/dev/dh-meier/output/kraken-nospace/raw/", features = "c", ngram.size = 3, preserve.case = FALSE)
# Get freq lists
#data = stylo::make.table.of.frequencies(corpus = data, features = unique(sort(unlist(data))), relative = FALSE)
# Write it
#write.csv(as.matrix(data), "data/kraken_nospace_raw_char3grams.csv")
# Character 3-gram counts from the raw (unexpanded) transcription;
# transpose to features × texts, restrict to the selected texts, and drop
# 3-grams that no longer occur in the remaining corpus.
data = read.csv("data/kraken_nospace_raw_char3grams.csv", header = TRUE, row.names = 1)
data = t(data)
data = data[, toKeep]
data = data[rowSums(data) > 0, ]

Burrows + vector-length norm

d = data
# Feature selection based on Moisl 2011 (implemented in functions.R).
# NOTE(review): the result's 4th column is used as the filter — confirm the
# column semantics of selection() in functions.R.
select = selection(d, z = 1.645)
select = select[,4]
# Normalisations: relative frequencies per text.
d = relativeFreqs(d)
# save data for robustness checks (saved BEFORE feature selection here;
# the affixes/POS/lemmas sections save after it — presumably intentional,
# but worth confirming)
Raw3grSave = d
d = d[select,]
d = normalisations(d)
# Ward agglomerative clustering of texts (columns of d, hence the
# transpose) on Manhattan distances.
myCAH = cluster::agnes(t(d), metric = "manhattan", method="ward")
# Save for the final comparison section
CAHRaw3gr = myCAH
#TODO: heights
# barplot(sort(myCAH$height))
# Dendrogram coloured by a k = 9 cut.
plotRaw3grams = cahPlotCol(myCAH, k = 9, main = "Characters 3-grams from raw data (Transkr)")

# Same features, clustered via a self-organising map first.
somCAH = somCluster(d)
somplotRaw3grams = cahPlotCol(somCAH, k = 9, main = "SOM BASED - Characters 3-grams from raw data (Transkr)")

Transkribus expanded data

Load data

# Reload the expanded word-form frequencies (words become rows after the
# transpose), keep only the selected texts, and drop all-zero features.
data = t(read.csv("data/kraken_nospace_expanded_words.csv", header = TRUE, row.names = 1))
data = data[, toKeep]
data = data[rowSums(data) > 0, ]

Forms from expanded data

Burrows + vector-length norm

d = data
# Feature selection based on Moisl 2011 (functions.R); 4th column used as
# the filter — NOTE(review): confirm column semantics in functions.R.
select = selection(d, z = 1.645)
select = select[,4]
# Normalisations: relative frequencies per text.
d = relativeFreqs(d)
# save data for robustness checks (before selection, like the 3-gram section)
WordsSave = d
d = d[select,]
d = normalisations(d)
# Ward clustering of texts on Manhattan distances.
myCAH = cluster::agnes(t(d), metric = "manhattan", method="ward")
# Save for the final comparison section
CAHForms = myCAH
#TODO: heights
# barplot(sort(myCAH$height))
plotForms = cahPlotCol(myCAH, k = 9, main = "Expanded word forms (Transkr/Boudams/Pie)")

somCAH = somCluster(d)
somplotForms = cahPlotCol(somCAH, k = 9, main = "SOM BASED - Expanded word forms (Transkr/Boudams/Pie)")

Affixes from expanded data

# Creating affixes database from all words.
# countAffixes() comes from functions.R — presumably it derives affix
# (prefix/suffix) counts from the word-frequency matrix; confirm there.
dataAffs = countAffixes(data)

Burrows + vector-length norm

d = dataAffs
# Feature selection based on Moisl 2011 (functions.R).
select = selection(d, z = 1.645)
select = select[,4]
# Normalisations: relative frequencies per text.
d = relativeFreqs(d)
d = d[select,]
# NOTE(review): here the robustness matrix is saved AFTER feature
# selection, unlike Raw3grSave/WordsSave (saved before) — confirm intended.
AffixesSave = d
d = normalisations(d)
# Ward clustering of texts on Manhattan distances.
myCAH = cluster::agnes(t(d), metric = "manhattan", method="ward")
# Save for the final comparison section
CAHAffs = myCAH
#TODO: heights
# barplot(sort(myCAH$height))
plotAffixes = cahPlotCol(myCAH, k = 9, main = "Expanded affixes (Transkr/Boudams/Pie)")
somCAH = somCluster(d)
somplotAffixes = cahPlotCol(somCAH, k = 9, main = "SOM BASED - Expanded affixes (Transkr/Boudams/Pie)")

Unstandardised function words from expanded data

Create function words list

# Inspect the most frequent forms to curate the list (kept for reference):
#labels(sort(rowSums(data), decreasing = TRUE)[1:300])
# With or without pronouns? (original note: "Avec ou sans pronoms ?")
# source()$value is the last expression of the script, i.e. the curated
# character vector of function words.
functionWords = source("functionWords.R")$value

Burrows + vector-length norm

# Relative frequencies over the full vocabulary, then restricted to the
# curated function-word list (no Moisl selection in this section).
d = relativeFreqs(data)
d = d[functionWords,]
# save data for robustness checks
FWSave = d
d = normalisations(d)
# Ward clustering of texts on Manhattan distances.
myCAH = cluster::agnes(t(d), metric = "manhattan", method="ward")
# Save for the final comparison section
CAHFW = myCAH
# barplot(sort(myCAH$height))
# NOTE(review): k = 8 here while most sections cut at k = 9 — confirm intended.
plotFW = cahPlotCol(myCAH, k = 8, main = "Function words with pronouns and auxiliaries\n(Transkr/Boudams/Pie)")
#plotCol(myCAH, main = "toto")
somCAH = somCluster(d)
somplotFW = cahPlotCol(somCAH, k = 9, main = "SOM BASED - Function words")

Transkribus with linguistic annotation

POS 3-grams

# POS 3-gram counts (semicolon-separated export); texts are columns.
data = read.csv("data/kraken_nospace_pos3-gr.csv", header = TRUE, row.names = 1, sep = ";")
# remove total freq column
data = data[, -1]
# Undo read.csv's name mangling so columns match toKeep: drop the "X"
# prefix added to names starting with a digit, then restore the characters
# that were turned into dots.
colnames(data) = gsub("^X", "", colnames(data))
# fixed = TRUE: the previous patterns ".decolumnized" and "Leg." were
# regexes in which "." matches ANY character, risking accidental matches;
# treat them as literal strings instead.
colnames(data) = gsub(".decolumnized", "", colnames(data), fixed = TRUE)
colnames(data) = gsub("Leg.", "Leg-", colnames(data), fixed = TRUE)
# Restrict to the selected texts and drop features absent from them.
data = data[, toKeep]
data = data[rowSums(data) > 0, ]
data = as.matrix(data)

Burrows + vector-length norm

d = data
# Feature selection based on Moisl 2011 (functions.R).
select = selection(d, z = 1.645)
select = select[,4]
# Normalisations: relative frequencies per text.
d = relativeFreqs(d)
# save data for robustness checks (saved AFTER selection in this section)
d = d[select,]
POS3grSave = d
d = normalisations(d)
# Ward clustering of texts on Manhattan distances.
myCAH = cluster::agnes(t(d), metric = "manhattan", method="ward")
# Save for the final comparison section
CAHPOS3gr = myCAH
#TODO: heights
# barplot(sort(myCAH$height))
plotPOS3grams = cahPlotCol(myCAH, k = 9, main = "POS 3-grams (Transkr/Boudams/Pie/Pie)")
somCAH = somCluster(d)
somplotPOS3grams = cahPlotCol(somCAH, k = 9, main = "SOM BASED - POS 3-grams")

Lemmas

# Lemma counts (semicolon-separated export); texts are columns.
data = read.csv("data/kraken_nospace_lemmas.csv", header = TRUE, row.names = 1, sep = ";")
# remove total freq column
data = data[, -1]
# Undo read.csv's name mangling so columns match toKeep (same cleanup as
# the POS section).
colnames(data) = gsub("^X", "", colnames(data))
# fixed = TRUE: the previous patterns ".decolumnized" and "Leg." were
# regexes in which "." matches ANY character, risking accidental matches;
# treat them as literal strings instead.
colnames(data) = gsub(".decolumnized", "", colnames(data), fixed = TRUE)
colnames(data) = gsub("Leg.", "Leg-", colnames(data), fixed = TRUE)
# Restrict to the selected texts and drop features absent from them.
data = data[, toKeep]
data = data[rowSums(data) > 0, ]
data = as.matrix(data)

Burrows + vector-length norm

d = data
# Feature selection based on Moisl 2011 (functions.R).
select = selection(d, z = 1.645)
select = select[,4]
# Normalisations: relative frequencies per text.
d = relativeFreqs(d)
d = d[select,]
# Saved AFTER selection (as in the affixes/POS sections).
LemmasSave = d
d = normalisations(d)
# Ward clustering of texts on Manhattan distances.
myCAH = cluster::agnes(t(d), metric = "manhattan", method="ward")
# Save for the final comparison section
CAHLemmas = myCAH
#TODO: heights
# barplot(sort(myCAH$height))
plotLemmas = cahPlotCol(myCAH, k = 9, main = "Lemmas (Transkr/Boudams/Pie/Pie)")
somCAH = somCluster(d)
somplotLemmas = cahPlotCol(somCAH, k = 9, main = "SOM BASED - Lemmas")

Function words from lemmas

# Find function words among the most frequent lemmas (kept for reference):
#rownames(data)[1:250]
# source()$value is the last expression of the script, i.e. the curated
# character vector of function lemmas.
functionLemmas = source("functionLemmas.R")$value

Burrows + vector-length norm

# Relative frequencies over all lemmas, restricted to the curated
# function-lemma list (no Moisl selection in this section).
d = relativeFreqs(data)
d = d[functionLemmas,]
# save data for robustness checks
FLSave = d
d = normalisations(d)
# Ward clustering of texts on Manhattan distances.
myCAH = cluster::agnes(t(d), metric = "manhattan", method="ward")
# Save for the final comparison section
CAHFL = myCAH
# barplot(sort(myCAH$height))
# NOTE(review): this call re-parses the whole raw corpus and overwrites
# `data`, but its result is never used — the plots below use `myCAH`/`d`
# computed above, and the next section rebuilds `data` from the *Save
# matrices. It looks like a misplaced duplicate of the commented-out call
# in the "3-grams from raw data" section; disabled to avoid the expensive
# no-op (and the stylo dependency at render time).
# data = stylo::load.corpus.and.parse(corpus.dir = "~/dev/dh-meier/output/kraken-nospace/raw/", features = "c", ngram.size = 3, preserve.case = FALSE)
## loading 00_Ano_Leg-A_Ap_Ev_Dis_Pierre1.txt   ...
## loading 01_Ano_Leg-A_Ap_NA_Vie_Pierre2.txt   ...
## loading 02_Ano_Leg-A_Ap_NA_Pas_Paul.txt  ...
## loading 03_Ano_Leg-A_Ap_NA_Mar_Jean.txt  ...
## loading 04_Ano_Leg-A_Ap_NA_Vie_Jean_Ev.txt   ...
## loading 05_Ano_Leg-A_Ap_NA_Vie_Jacques.txt   ...
## loading 06_Ano_Leg-A_Ap_NA_Vie_Matthieu.txt  ...
## loading 07_Ano_Leg-A_Ap_NA_Vie_SimonJude.txt ...
## loading 08_Ano_Leg-A_Ap_NA_Vie_Philippe.txt  ...
## loading 09_Ano_Leg-A_Ap_NA_Vie_JacquesMineur.txt ...
## loading 10_Ano_Leg-A_Ap_NA_Vie_Barthelemy.txt    ...
## loading 11_Ano_Leg-A_Ap_NA_Vie_Marc.txt  ...
## loading 12_Ano_Leg-A_Ma_Ho_Vie_Longin.txt    ...
## loading 13_Ano_Leg-B_Ma_Ho_Vie_Sebastien.txt ...
## loading 14_Ano_Leg-B_Ma_Ho_Vie_Vincent.txt   ...
## loading 15_Ano_Leg-B_Ma_Ho_Vie_Georges.txt   ...
## loading 16_Ano_Leg-B_Ma_Ho_Vie_Christophe.txt    ...
## loading 17_Ano_Leg-B_Ma_Ho_Vie_Agathe.txt    ...
## loading 18_Ano_Leg-B_Ma_Ho_Vie_Luce.txt  ...
## loading 19_Ano_Leg-B_Ma_Ho_Vie_Agnes.txt ...
## loading 20_Ano_Leg-B_Ma_Ho_Vie_Felicite.txt  ...
## loading 21_Ano_Leg-B_Ma_Ho_Vie_Christine.txt ...
## loading 22_Ano_Leg-B_Ma_Ho_Vie_Cecile.txt    ...
## loading 23_Ano_Leg-B_Ma_Ho_Vie_Sixte.txt ...
## loading 24_Ano_Leg-B_Ma_Ho_Vie_Laurent.txt   ...
## loading 25_Ano_Leg-B_Ma_Ho_Vie_Hippolyte.txt ...
## loading 26_Ano_Leg-B_Ma_Ev_Vie_Lambert.txt   ...
## loading 27_Ano_Leg-B_Ma_Ho_Vie_Pantaleon.txt ...
## loading 28_Ano_Leg-B_Ma_Ho_Vie_Clement.txt   ...
## loading 29_Wau_Leg-C_Co_Ev_Vie_Martin.txt    ...
## loading 30_Wau_Leg-C_Co_Ev_Tra_Martin2.txt   ...
## loading 31_Wau_Leg-C_Co_Ev_Dia_Martin3.txt   ...
## loading 32_Wau_Leg-C_Co_Ev_Vie_Brice.txt ...
## loading 33_Wau_Leg-C_Co_Er_Vie_Gilles.txt    ...
## loading 34_Wau_Leg-C_Co_Ev_Vie_Martial.txt   ...
## loading 35_Wau_Leg-C_Co_Ev_Vie_Nicolas.txt   ...
## loading 36_Wau_Leg-C_Co_Ev_Mir_Nicolas2.txt  ...
## loading 37_Wau_Leg-C_Co_Ev_Tra_Nicolas3.txt  ...
## loading 38_Wau_Leg-C_Co_Ev_Vie_Jerome.txt    ...
## loading 39_Wau_Leg-C_Co_Ev_Vie_Benoit.txt    ...
## loading 40_Wau_Leg-C_Co_Er_Vie_Alexis.txt    ...
## loading 41_Ano_Leg-C_Vi_NA_Vie_Irene.txt ...
## loading 42_Ano_Leg-B_Vi_NA_Ass_NotreDame.txt ...
## loading 43_Ano_Leg-C_Vi_NA_Vie_Catherine.txt ...
## loading 44_Ano_Leg-C_Ap_NA_Vie_Andre.txt ...
## loading 45_Ano_Leg-C_Ap_NA_Pas_Andre2.txt    ...
## loading 46_Ano_Leg-B_Co_NA_Pur_Patrice.txt   ...
## loading 47_Ano_Leg-C_Co_er_Vie_PaulErmite.txt    ...
## loading 48_Ano_Leg-C_Co_ev_Tra_Benoit2.txt   ...
## loading 49_Ano_Leg-C_NA_NA_Vie_Maur.txt  ...
## loading 50_Ano_Leg-C_NA_NA_Vie_Placide.txt   ...
## loading 51_Ano_Leg-C_Ma_ho_Vie_Eustache.txt  ...
## loading 52_Ano_Leg-C_Co_NA_Vie_Forsin.txt    ...
## loading 53_Ano_Leg-C_Vi_NA_Vie_Marguerite.txt    ...
## loading 54_Ano_Leg-C_Vi_NA_Vie_Pelagie.txt   ...
## loading 55_Ano_Leg-C_Co_NA_Vie_Simeon.txt    ...
## loading 56_Ano_Leg-C_Co_NA_Vie_Mamertin.txt  ...
## loading 57_Ano_Leg-C_Vi_NA_Vie_Julien.txt    ...
## loading 58_Ano_Leg-C_Vi_NA_Vie_MarieEgyptienne.txt   ...
## loading 59_Ano_Leg-C_Vi_NA_Vie_Euphrasie.txt ...
## loading 60_Ano_Leg-B_NA_NA_NA_Antechriste.txt    ...
## loading 61_Ano_Leg-B_NA_NA_NA_Jugement.txt   ...
## loading 62_Ano_Leg-N_NA_NA_NA_Index.txt  ...
## loading 63_Ric_Leg-N_NA_NA_NA_Bestiaire.txt  ...
## loading 64_Ano_Leg-N_NA_NA_NA_Bestiaire2.txt ...
## slicing input text into tokens...
## 
## turning words into features, e.g. char n-grams (if applicable)...
# NOTE(review): k = 8 here while most sections cut at k = 9 — confirm intended.
plotFL = cahPlotCol(myCAH, k = 8, main = "Function Lemmas with pronouns and auxiliaries\n(Transkr/Boudams/Pie)")
#plotCol(myCAH, main = "toto")
# SOM-based clustering of the same function-lemma frequencies.
somCAH = somCluster(d)
somplotFL = cahPlotCol(somCAH, k = 9, main = "SOM BASED - Function words (lemmas)")

Affixes + POS 3-gr + Function words (lemmas)

# Combined feature set: stack the saved affix, POS 3-gram and
# function-lemma relative-frequency matrices into one features × texts
# matrix. rbind requires identical column (text) sets — holds as long as
# every section filtered with the same toKeep.
data = rbind(AffixesSave, POS3grSave, FLSave)
d = normalisations(data)
# Ward clustering of texts on Manhattan distances.
myCAH = cluster::agnes(t(d), metric = "manhattan", method="ward")
# Save for the final comparison section
CAHGlob = myCAH
#TODO: heights
# barplot(sort(myCAH$height))
plotGlob = cahPlotCol(myCAH, k = 9, main = "Affixes + POS 3- grams + Function words (lemmas)")
somCAH = somCluster(d)
somplotGlob = cahPlotCol(somCAH, k = 9, main = "SOM BASED - Affixes + POS 3- grams + Function words (lemmas)")

Plots

Analyses

# Legacy layout from an earlier iteration of the analysis, kept for reference:
#featlabel = "features of ME ±2σ with conf. > 90%"
#A = cahPlotCol(CAHLemma, main = "A", xlab = paste( ncol(CAHLemma$data), featlabel), k = 6, lrect = -12)
# B = cahPlotCol(CAHRhyme, main = "B", xlab = paste( ncol(CAHRhyme$data), featlabel), k = 6, lrect = -7, ylab = " ")
# C = cahPlotCol(CAHAllWords, main = "C", xlab = paste( ncol(CAHAllWords$data), featlabel), k = 6, ylab = " ")
# D = cahPlotCol(CAHAffs, main = "D", xlab = paste( ncol(CAHAffs$data), featlabel), k = 6, ylab = " ")
# E = cahPlotCol(CAHPOS3gr, main = "E", xlab = paste( ncol(CAHPOS3gr$data), featlabel), k = 6, lrect = -12 , ylab = " ")
# F = cahPlotCol(CAHmfw, main = "F", k = 6, lrect = -5, ylab = " ")
# gridExtra::grid.arrange(A, B, C, D, E, F, ncol = 2)
# All eight CAH dendrograms in one 2-column grid…
gridExtra::grid.arrange(plotRaw3grams, plotForms, plotAffixes, plotFW, plotLemmas, plotFL, plotPOS3grams, plotGlob, ncol = 2)

# …and the eight SOM-based counterparts.
gridExtra::grid.arrange(somplotRaw3grams, somplotForms, somplotAffixes, somplotFW, somplotLemmas, somplotFL, somplotPOS3grams, somplotGlob, ncol = 2)

Robustness

# Robustness: pairwise agreement between the k = 9 partitions induced by the
# eight clusterings (compareHC() from functions.R). NOTE(review): the output
# matrix is not symmetric (e.g. raw3grams/Forms 0.797 vs 0.763), so the
# comparison is presumably direction-dependent — confirm in functions.R.
cahList = list(raw3grams = CAHRaw3gr, Forms = CAHForms, Affs = CAHAffs, FW = CAHFW, Lemmas = CAHLemmas, FunctLemm = CAHFL, POS3gr = CAHPOS3gr, Global = CAHGlob)
compareHC(cahList, k = 9)
##           raw3grams     Forms      Affs        FW    Lemmas FunctLemm
## raw3grams 1.0000000 0.7966102 0.8474576 0.8474576 0.6949153 0.6949153
## Forms     0.7627119 1.0000000 0.7457627 0.8305085 0.6610169 0.6779661
## Affs      0.8644068 0.7457627 1.0000000 0.7457627 0.6779661 0.7118644
## FW        0.8474576 0.8305085 0.7457627 1.0000000 0.6949153 0.7118644
## Lemmas    0.6949153 0.6271186 0.6271186 0.6779661 1.0000000 0.6949153
## FunctLemm 0.6440678 0.6440678 0.6949153 0.6779661 0.6779661 1.0000000
## POS3gr    0.5932203 0.6610169 0.6440678 0.6440678 0.5932203 0.7118644
## Global    0.8474576 0.7796610 0.8305085 0.7627119 0.6779661 0.6779661
##              POS3gr    Global
## raw3grams 0.6440678 0.8644068
## Forms     0.6610169 0.7627119
## Affs      0.6779661 0.8474576
## FW        0.6779661 0.7796610
## Lemmas    0.6610169 0.6779661
## FunctLemm 0.6949153 0.6440678
## POS3gr    1.0000000 0.6271186
## Global    0.6779661 1.0000000